First we have to fetch the data. The source used here is https://covidtracking.com. The provide an API for fetching data by state. In the following data chunk, we end up with a list of lists. The number of measurements is shown along with the structure of a measurement.

library(httr)
req <- GET("https://covidtracking.com/api/v1/states/daily.json")

data <- content(req, as = "parsed")
length(data)
## [1] 11186
str(data[[1]])
## List of 54
##  $ date                       : int 20200919
##  $ state                      : chr "AK"
##  $ positive                   : int 7674
##  $ negative                   : int 415176
##  $ pending                    : NULL
##  $ totalTestResults           : int 422850
##  $ hospitalizedCurrently      : int 41
##  $ hospitalizedCumulative     : NULL
##  $ inIcuCurrently             : NULL
##  $ inIcuCumulative            : NULL
##  $ onVentilatorCurrently      : int 13
##  $ onVentilatorCumulative     : NULL
##  $ recovered                  : int 2438
##  $ dataQualityGrade           : chr "A"
##  $ lastUpdateEt               : chr "9/19/2020 03:59"
##  $ dateModified               : chr "2020-09-19T03:59:00Z"
##  $ checkTimeEt                : chr "09/18 23:59"
##  $ death                      : int 45
##  $ hospitalized               : NULL
##  $ dateChecked                : chr "2020-09-19T03:59:00Z"
##  $ totalTestsViral            : int 422850
##  $ positiveTestsViral         : int 6970
##  $ negativeTestsViral         : int 415600
##  $ positiveCasesViral         : int 7674
##  $ deathConfirmed             : int 45
##  $ deathProbable              : NULL
##  $ totalTestEncountersViral   : NULL
##  $ totalTestsPeopleViral      : NULL
##  $ totalTestsAntibody         : NULL
##  $ positiveTestsAntibody      : NULL
##  $ negativeTestsAntibody      : NULL
##  $ totalTestsPeopleAntibody   : NULL
##  $ positiveTestsPeopleAntibody: NULL
##  $ negativeTestsPeopleAntibody: NULL
##  $ totalTestsPeopleAntigen    : NULL
##  $ positiveTestsPeopleAntigen : NULL
##  $ totalTestsAntigen          : NULL
##  $ positiveTestsAntigen       : NULL
##  $ fips                       : chr "02"
##  $ positiveIncrease           : int 87
##  $ negativeIncrease           : int 4470
##  $ total                      : int 422850
##  $ totalTestResultsSource     : chr "posNeg"
##  $ totalTestResultsIncrease   : int 4557
##  $ posNeg                     : int 422850
##  $ deathIncrease              : int 0
##  $ hospitalizedIncrease       : int 0
##  $ hash                       : chr "934611e27c6c701c808cf53976fe9a48c80fd513"
##  $ commercialScore            : int 0
##  $ negativeRegularScore       : int 0
##  $ negativeScore              : int 0
##  $ positiveScore              : int 0
##  $ score                      : int 0
##  $ grade                      : chr ""
library(httr)
library(data.table)
info_req <- GET("https://covidtracking.com/api/v1/states/info.json")
info <- content(info_req, as = "parsed")
length(info)
## [1] 56
str(info[[1]])
## List of 14
##  $ state                                      : chr "AK"
##  $ notes                                      : chr "Alaska combines PCR and antigen tests in the total tests figure reported on the state's dashboard.\n\nAlaska re"| __truncated__
##  $ covid19Site                                : chr "http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/monitoring.aspx"
##  $ covid19SiteSecondary                       : chr "https://experience.arcgis.com/experience/ed1c874ca60b4c15ab09095a070065ca"
##  $ covid19SiteTertiary                        : chr "https://alaska-dhss.maps.arcgis.com/apps/opsdashboard/index.html#/8782a14ef52342e99f866a3b8a3e624a"
##  $ twitter                                    : chr "@Alaska_DHSS"
##  $ covid19SiteOld                             : chr "http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx"
##  $ covidTrackingProjectPreferredTotalTestUnits: chr "Specimens"
##  $ covidTrackingProjectPreferredTotalTestField: chr "totalTestsViral"
##  $ totalTestResultsField                      : chr "Total Tests (PCR)"
##  $ name                                       : chr "Alaska"
##  $ fips                                       : chr "02"
##  $ pui                                        : chr ""
##  $ pum                                        : logi FALSE
state_info <- rbindlist(as.list(info), fill=TRUE)


get_state_name <- function(abbr) {
  state_info[state_info$state == abbr]$name
}
library(data.table)
library(ggplot2)
dt <- rbindlist(data, fill=TRUE)
## Warning in rbindlist(data, fill = TRUE): Column 5 ['pending'] of item 1 is
## length 0. This (and 225969 others like it) has been filled with NA (NULL for
## list columns) to make each item uniform.
dt$date <- as.Date(as.character(dt$date), format = "%Y%m%d")
dt$dateChecked <- as.POSIXct(dt$dateChecked, "%Y-%m-%dT%H:%M:%S", tz="UTC")
dt$state <- as.factor(dt$state)
dt$fips <- as.factor(dt$fips)
dt <- dt[, -"hash"]

str(dt[1,])
## Classes 'data.table' and 'data.frame':   1 obs. of  53 variables:
##  $ date                       : Date, format: "2020-09-19"
##  $ state                      : Factor w/ 56 levels "AK","AL","AR",..: 1
##  $ positive                   : int 7674
##  $ negative                   : int 415176
##  $ pending                    : int NA
##  $ totalTestResults           : int 422850
##  $ hospitalizedCurrently      : int 41
##  $ hospitalizedCumulative     : int NA
##  $ inIcuCurrently             : int NA
##  $ inIcuCumulative            : int NA
##  $ onVentilatorCurrently      : int 13
##  $ onVentilatorCumulative     : int NA
##  $ recovered                  : int 2438
##  $ dataQualityGrade           : chr "A"
##  $ lastUpdateEt               : chr "9/19/2020 03:59"
##  $ dateModified               : chr "2020-09-19T03:59:00Z"
##  $ checkTimeEt                : chr "09/18 23:59"
##  $ death                      : int 45
##  $ hospitalized               : int NA
##  $ dateChecked                : POSIXct, format: "2020-09-19 03:59:00"
##  $ totalTestsViral            : int 422850
##  $ positiveTestsViral         : int 6970
##  $ negativeTestsViral         : int 415600
##  $ positiveCasesViral         : int 7674
##  $ deathConfirmed             : int 45
##  $ deathProbable              : int NA
##  $ totalTestEncountersViral   : num NA
##  $ totalTestsPeopleViral      : int NA
##  $ totalTestsAntibody         : int NA
##  $ positiveTestsAntibody      : int NA
##  $ negativeTestsAntibody      : int NA
##  $ totalTestsPeopleAntibody   : int NA
##  $ positiveTestsPeopleAntibody: int NA
##  $ negativeTestsPeopleAntibody: int NA
##  $ totalTestsPeopleAntigen    : int NA
##  $ positiveTestsPeopleAntigen : int NA
##  $ totalTestsAntigen          : int NA
##  $ positiveTestsAntigen       : int NA
##  $ fips                       : Factor w/ 56 levels "01","02","04",..: 2
##  $ positiveIncrease           : int 87
##  $ negativeIncrease           : int 4470
##  $ total                      : int 422850
##  $ totalTestResultsSource     : chr "posNeg"
##  $ totalTestResultsIncrease   : int 4557
##  $ posNeg                     : int 422850
##  $ deathIncrease              : int 0
##  $ hospitalizedIncrease       : int 0
##  $ commercialScore            : int 0
##  $ negativeRegularScore       : int 0
##  $ negativeScore              : int 0
##  $ positiveScore              : int 0
##  $ score                      : int 0
##  $ grade                      : chr ""
##  - attr(*, ".internal.selfref")=<externalptr>
picked <- dt[, c("state", "positive", "negative", "recovered","death", "total")]
pairs(picked)

normalized <- cbind.data.frame(dt$positive/dt$totalTestResults, dt$negative/dt$totalTestResults, dt$recovered/dt$total, dt$death/dt$total)
colnames(normalized) <- c("pos", "neg", "recovered", "dead")

pairs(normalized)

library(ggplot2)
library(cowplot)

plot_list <- list()

for(s in levels(dt$state)) {

  by_state <- dt[dt$state == s,]
  date_order <- order(by_state$date)
  
  by_state_ordered_by_date <- by_state[date_order,]
  # to find the earliest meausrement, restrict by complete cases
  # the date of element 0.
  earliest_death_report <- na.omit(by_state_ordered_by_date[,c("date","death")])[1,]
  earliest_death <- by_state_ordered_by_date[by_state_ordered_by_date$death > 0,c("date","death")][1,]
print(data.frame(state = s, earliest_report = earliest_death_report$date, earliest_death = earliest_death$date, earliest_number =earliest_death$death ))

  setnafill(by_state_ordered_by_date,type="locf", cols=c("death"))
  setnafill(by_state_ordered_by_date,type="const", fill=0, cols=c("death"))
 
  integral_breaks <- function(values) {
    breaks <- unique(floor(pretty(seq(0, 1.1 *  max(values) + 1))))
    return(breaks)
  }
  y_breaks <- integral_breaks(
    by_state_ordered_by_date$death
  )
  
  p <- ggplot(by_state_ordered_by_date, aes(x=date, y=death)) +
            geom_point(shape=1) +
            scale_y_continuous(breaks = y_breaks)

            # ggtitle(get_state_name(s))

  plot_list[[s]] <- p

}
##   state earliest_report earliest_death earliest_number
## 1    AK      2020-03-06     2020-03-25               1
##   state earliest_report earliest_death earliest_number
## 1    AL      2020-03-15     2020-03-26               1
##   state earliest_report earliest_death earliest_number
## 1    AR      2020-03-22     2020-03-25               2
##   state earliest_report earliest_death earliest_number
## 1    AS      2020-03-18           <NA>              NA
##   state earliest_report earliest_death earliest_number
## 1    AZ      2020-03-13     2020-03-21               1
##   state earliest_report earliest_death earliest_number
## 1    CA      2020-03-12     2020-03-12               4
##   state earliest_report earliest_death earliest_number
## 1    CO      2020-03-14     2020-03-14               1
##   state earliest_report earliest_death earliest_number
## 1    CT      2020-03-19     2020-03-19               1
##   state earliest_report earliest_death earliest_number
## 1    DC      2020-03-19     2020-03-20               1
##   state earliest_report earliest_death earliest_number
## 1    DE      2020-03-08     2020-03-24               1
##   state earliest_report earliest_death earliest_number
## 1    FL      2020-03-11     2020-03-11               2
##   state earliest_report earliest_death earliest_number
## 1    GA      2020-03-13     2020-03-13               1
##   state earliest_report earliest_death earliest_number
## 1    GU      2020-03-22     2020-03-22               1
##   state earliest_report earliest_death earliest_number
## 1    HI      2020-04-01     2020-04-01               1
##   state earliest_report earliest_death earliest_number
## 1    IA      2020-03-25     2020-03-25               1
##   state earliest_report earliest_death earliest_number
## 1    ID      2020-03-22     2020-03-27               3
##   state earliest_report earliest_death earliest_number
## 1    IL      2020-03-17     2020-03-17               1
##   state earliest_report earliest_death earliest_number
## 1    IN      2020-03-14     2020-03-16               1
##   state earliest_report earliest_death earliest_number
## 1    KS      2020-03-14     2020-03-14               1
##   state earliest_report earliest_death earliest_number
## 1    KY      2020-03-14     2020-03-14               1
##   state earliest_report earliest_death earliest_number
## 1    LA      2020-03-15     2020-03-15               2
##   state earliest_report earliest_death earliest_number
## 1    MA      2020-03-18     2020-03-18               2
##   state earliest_report earliest_death earliest_number
## 1    MD      2020-03-16     2020-03-16               2
##   state earliest_report earliest_death earliest_number
## 1    ME      2020-03-27     2020-03-27               1
##   state earliest_report earliest_death earliest_number
## 1    MI      2020-03-17     2020-03-17               2
##   state earliest_report earliest_death earliest_number
## 1    MN      2020-03-21     2020-03-21               1
##   state earliest_report earliest_death earliest_number
## 1    MO      2020-03-14     2020-03-19               1
##   state earliest_report earliest_death earliest_number
## 1    MP      2020-03-18     2020-04-02               1
##   state earliest_report earliest_death earliest_number
## 1    MS      2020-03-20     2020-03-20               1
##   state earliest_report earliest_death earliest_number
## 1    MT      2020-03-28     2020-03-28               1
##   state earliest_report earliest_death earliest_number
## 1    NC      2020-03-16     2020-03-25               1
##   state earliest_report earliest_death earliest_number
## 1    ND      2020-03-13     2020-03-28               1
##   state earliest_report earliest_death earliest_number
## 1    NE      2020-03-19     2020-03-28               2
##   state earliest_report earliest_death earliest_number
## 1    NH      2020-03-24     2020-03-24               1
##   state earliest_report earliest_death earliest_number
## 1    NJ      2020-02-10     2020-03-11               1
##   state earliest_report earliest_death earliest_number
## 1    NM      2020-03-25     2020-03-25               1
##   state earliest_report earliest_death earliest_number
## 1    NV      2020-03-05     2020-03-15               1
##   state earliest_report earliest_death earliest_number
## 1    NY      2020-03-15     2020-03-15               3
##   state earliest_report earliest_death earliest_number
## 1    OH      2020-03-20     2020-03-20               1
##   state earliest_report earliest_death earliest_number
## 1    OK      2020-03-19     2020-03-19               1
##   state earliest_report earliest_death earliest_number
## 1    OR      2020-03-18     2020-03-18               3
##   state earliest_report earliest_death earliest_number
## 1    PA      2020-03-18     2020-03-18               1
##   state earliest_report earliest_death earliest_number
## 1    PR      2020-03-22     2020-03-22               1
##   state earliest_report earliest_death earliest_number
## 1    RI      2020-03-29     2020-03-29               3
##   state earliest_report earliest_death earliest_number
## 1    SC      2020-03-16     2020-03-16               1
##   state earliest_report earliest_death earliest_number
## 1    SD      2020-03-18     2020-03-18               1
##   state earliest_report earliest_death earliest_number
## 1    TN      2020-03-23     2020-03-23               2
##   state earliest_report earliest_death earliest_number
## 1    TX      2020-03-17     2020-03-17               1
##   state earliest_report earliest_death earliest_number
## 1    UT      2020-03-15     2020-03-22               1
##   state earliest_report earliest_death earliest_number
## 1    VA      2020-03-15     2020-03-15               1
##   state earliest_report earliest_death earliest_number
## 1    VI      2020-04-05     2020-04-05               1
##   state earliest_report earliest_death earliest_number
## 1    VT      2020-03-20     2020-03-20               2
##   state earliest_report earliest_death earliest_number
## 1    WA      2020-02-26     2020-02-26               2
##   state earliest_report earliest_death earliest_number
## 1    WI      2020-03-20     2020-03-20               3
##   state earliest_report earliest_death earliest_number
## 1    WV      2020-03-15     2020-03-30               1
##   state earliest_report earliest_death earliest_number
## 1    WY      2020-03-27     2020-04-13               1
step = 4
for (i in seq(from = 1, to = length(plot_list), by = step)) {

  group <- plot_list[i:(i+step-1)]
  #print(str(group))
  print(plot_grid(plotlist = group, ncol = 2,labels = names(group)))
}

Now let’s look at overall population stats

pop <- GET("https://api.census.gov/data/2019/pep/population?get=DATE_CODE,POP,NAME&for=STATE:*")
popdata <- content(pop, as = "parsed")

dt2 <- rbindlist(popdata[2:length(popdata)])
colnames(dt2) <- unlist(popdata[[1]])

dt2 <- dt2[dt2$DATE_CODE == 12,-c("DATE_CODE")] 

dt2$POP <- as.numeric(dt2$POP)
dt2$state <- as.factor(dt2$state)
names(dt2)[names(dt2) == "state"] <- "fips"
dt2[order(dt2$NAME),]
ggplot(dt2, aes(x=NAME,y=POP), fill=NAME) + geom_bar(stat="identity")

dt3 <- merge(dt,dt2, by="fips")

Now we can calculate deaths per million of population.

dt3[["dpm"]] <- dt3$death / dt3$POP * 1e6
dt3

Now lets, foe each fips code, find the earliest day where dpm is >= 1 and then what is the dpm 21 days after that point.

state_df <- data.frame(
  state=character(0),
  location_name=character(0),
  date1m=character(0),
  dpm21d=numeric(0)
  )
str(state_df)
## 'data.frame':    0 obs. of  4 variables:
##  $ state        : chr 
##  $ location_name: chr 
##  $ date1m       : chr 
##  $ dpm21d       : num
for (code in levels(dt3$fips)) {
  if (length(dt3[dt3$fips == code,]$date) == 0) {
    break
  }
  print(code)
  #print(dt3[dt3$fips == code,]$date)

  by_state <- dt3[dt3$fips == code,]
  date_order <- order(by_state$date)
  by_state_ordered_by_date <- by_state[date_order,]
  a <- setnafill(by_state_ordered_by_date,type="locf", cols=c("dpm"))
  b <- setnafill(by_state_ordered_by_date,type="const", fill=0, cols=c("dpm"))

  k <- Position(function(x) x > 1, by_state_ordered_by_date$dpm)
  k21 <- k + 21
  print(by_state_ordered_by_date[k,]$date)
  obs <- list(
  state=by_state_ordered_by_date[k,]$state,
  location_name=by_state_ordered_by_date[k,]$NAME,
  date1m=as.character(by_state_ordered_by_date[k,]$date),
  dpm21d= by_state_ordered_by_date[k21,]$dpm
  )
  state_df <- rbind(state_df,obs)
}
## [1] "01"
## [1] "2020-03-30"
## [1] "02"
## [1] "2020-03-25"
## [1] "04"
## [1] "2020-03-26"
## [1] "05"
## [1] "2020-03-28"
## [1] "06"
## [1] "2020-03-24"
## [1] "08"
## [1] "2020-03-23"
## [1] "09"
## [1] "2020-03-21"
## [1] "10"
## [1] "2020-03-24"
## [1] "11"
## [1] "2020-03-20"
## [1] "12"
## [1] "2020-03-25"
## [1] "13"
## [1] "2020-03-20"
## [1] "15"
## [1] "2020-04-03"
## [1] "16"
## [1] "2020-03-27"
## [1] "17"
## [1] "2020-03-24"
## [1] "18"
## [1] "2020-03-23"
## [1] "19"
## [1] "2020-03-29"
## [1] "20"
## [1] "2020-03-25"
## [1] "21"
## [1] "2020-03-27"
## [1] "22"
## [1] "2020-03-18"
## [1] "23"
## [1] "2020-03-29"
## [1] "24"
## [1] "2020-03-26"
## [1] "25"
## [1] "2020-03-21"
## [1] "26"
## [1] "2020-03-19"
## [1] "27"
## [1] "2020-03-29"
## [1] "28"
## [1] "2020-03-26"
## [1] "29"
## [1] "2020-03-25"
## [1] "30"
## [1] "2020-03-30"
## [1] "31"
## [1] "2020-03-28"
## [1] "32"
## [1] "2020-03-21"
## [1] "33"
## [1] "2020-03-28"
## [1] "34"
## [1] "2020-03-19"
## [1] "35"
## [1] "2020-03-31"
## [1] "36"
## [1] "2020-03-20"
## [1] "37"
## [1] "2020-04-02"
## [1] "38"
## [1] "2020-03-28"
## [1] "39"
## [1] "2020-03-26"
## [1] "40"
## [1] "2020-03-25"
## [1] "41"
## [1] "2020-03-23"
## [1] "42"
## [1] "2020-03-26"
## [1] "44"
## [1] "2020-03-29"
## [1] "45"
## [1] "2020-03-25"
## [1] "46"
## [1] "2020-03-18"
## [1] "47"
## [1] "2020-03-29"
## [1] "48"
## [1] "2020-03-29"
## [1] "49"
## [1] "2020-03-30"
## [1] "50"
## [1] "2020-03-20"
## [1] "51"
## [1] "2020-03-25"
## [1] "53"
## [1] "2020-03-01"
## [1] "54"
## [1] "2020-04-02"
## [1] "55"
## [1] "2020-03-25"
## [1] "56"
## [1] "2020-04-13"
state_df <- state_df[complete.cases(state_df),]
state_dt <- data.table(state_df)
state_dt
data_dir <- paste(getwd(), "data", sep='/')
temp <- tempfile()
imhe_source <- "https://ihmecovid19storage.blob.core.windows.net/latest/ihme-covid19.zip"
download.file(imhe_source, temp, mode="wb")

data_file <- paste(data_dir, "Summary_stats_all_locs.csv", sep='/')
unzip(temp, exdir=data_dir, junkpaths=TRUE)
dd <- read.table(data_file, sep=",", header=T)
unlink(temp)
#dd_ss <- dd[,c("location_name", "any_business_start_date","all_non.ess_business_start_date")]
dd_ss <- dd[,c("location_name", "any_business_start_date")]
dd_ss <- dd
state_dt2 <- merge(state_dt, dd_ss, by="location_name")
#head(by_state_ordered_by_date)
state_dt2$days_ld <- as.numeric(as.Date(state_dt2$any_business_start_date, '%Y-%m-%d') - as.Date(state_dt2$date1m))
state_dt2
ggplot(state_dt2, aes(x=days_ld, y=dpm21d)) + geom_point(shape=1) 
## Warning: Removed 1 rows containing missing values (geom_point).